{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
      "  from ._conv import register_converters as _register_converters\n",
      "/usr/local/lib/python3.5/dist-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
      "  \"This module will be removed in 0.20.\", DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "from utils import *\n",
    "import tensorflow as tf\n",
    "from collections import Counter\n",
    "from sklearn.cross_validation import train_test_split\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['negative', 'positive']\n",
      "10662\n",
      "10662\n"
     ]
    }
   ],
   "source": [
    "trainset = sklearn.datasets.load_files(container_path = 'data', encoding = 'UTF-8')\n",
    "trainset.data, trainset.target = separate_dataset(trainset,1.0)\n",
    "print(trainset.target_names)\n",
    "print(len(trainset.data))\n",
    "print(len(trainset.target))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total words: 197234\n"
     ]
    }
   ],
   "source": [
    "texts = ' '.join(trainset.data)\n",
    "words = texts.split()\n",
    "word2freq = Counter(words)\n",
    "print(\"Total words:\", len(words))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Vocabulary size: 20465\n"
     ]
    }
   ],
   "source": [
    "_words = set(words)\n",
    "word2idx = {c: i for i, c in enumerate(_words)}\n",
    "idx2word = {i: c for i, c in enumerate(_words)}\n",
    "vocab_size = len(idx2word)\n",
    "indexed = [word2idx[w] for w in words]\n",
    "print('Vocabulary size:', vocab_size)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "class SKIPGRAM:\n",
    "    def __init__(self, sample_size, vocab_size, embedded_size, window_size=3):\n",
    "        self.X = tf.placeholder(tf.int32, shape=[None])\n",
    "        self.Y = tf.placeholder(tf.int32, shape=[None, 1])\n",
    "        self.embedding = tf.Variable(tf.truncated_normal([vocab_size, embedded_size],\n",
    "                                                      stddev=1.0 / np.sqrt(embedded_size)))\n",
    "        self.bias = tf.Variable(tf.zeros([vocab_size]))\n",
    "        embedded = tf.nn.embedding_lookup(self.embedding, self.X)\n",
    "        self.cost = tf.reduce_mean(tf.nn.nce_loss(\n",
    "            weights=self.embedding,\n",
    "            biases=self.bias,\n",
    "            labels=self.Y,\n",
    "            inputs=embedded,\n",
    "            num_sampled=sample_size,\n",
    "            num_classes=vocab_size))\n",
    "        self.optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(self.cost)\n",
    "        self.valid_dataset = tf.placeholder(tf.int32, shape=[None])\n",
    "        norm = tf.sqrt(tf.reduce_sum(tf.square(self.embedding), 1, keep_dims=True))\n",
    "        normalized_embeddings = self.embedding / norm\n",
    "        valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, self.valid_dataset)\n",
    "        self.similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "batch_size = 128\n",
    "embedded_size = 128\n",
    "skip_window = 5\n",
    "epoch = 10\n",
    "valid_size = 10\n",
    "nearest_neighbors = 8"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "tf.reset_default_graph()\n",
    "sess = tf.InteractiveSession()\n",
    "model = SKIPGRAM(batch_size,vocab_size,embedded_size)\n",
    "sess.run(tf.global_variables_initializer())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_y(words, idx):\n",
    "    skipped = np.random.randint(1, skip_window+1)\n",
    "    left = idx - skip_window if (idx - skipped) > 0 else 0\n",
    "    right = idx + skipped\n",
    "    y = words[left: idx] + words[idx+1: right+1]\n",
    "    return list(set(y))\n",
    "\n",
    "def make_xy(int_words):\n",
    "    x,y = [], []\n",
    "    for i in range(0, len(int_words)):\n",
    "        input_w = int_words[i]\n",
    "        labels = get_y(int_words, i)\n",
    "        x.extend([input_w] * len(labels))\n",
    "        y.extend(labels)\n",
    "    return np.array(x), np.array(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "X, Y = make_xy(indexed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch 1, avg loss 53.344921\n",
      "Nearest to its: and, it, of, a, like, the, is, characters,\n",
      "Nearest to no: that, for, new, vague, premise, will, unfortunate, how,\n",
      "Nearest to the: of, and, is, to, it, a, as, its,\n",
      "Nearest to the: of, and, is, to, it, a, as, its,\n",
      "Nearest to are: there, some, just, script, before, dull, itself, story,\n",
      "Nearest to of: and, the, a, is, it, more, movie, its,\n",
      "Nearest to theres: something, vague, never, done, definitely, no, then, will,\n",
      "Nearest to excesses: passionately, affirming, trama, judgment, pays, naturalistic, climate, kick,\n",
      "Nearest to by: going, just, vague, that, making, movie, something, while,\n",
      "Nearest to a: and, of, it, is, to, that, the, its,\n",
      "epoch 2, avg loss 5.897086\n",
      "Nearest to characters: than, plot, movie, doing, story, it, better, is,\n",
      "Nearest to correctly: carousel, wagers, glint, lightens, noisy, rohmer, breathing, cineasts,\n",
      "Nearest to the: but, movie, of, to, its, as, it, is,\n",
      "Nearest to piece: none, crap, dramatic, work, emotionally, guy, also, sometimes,\n",
      "Nearest to was: that, what, made, this, movie, but, right, rather,\n",
      "Nearest to between: some, emotionally, difference, relationship, dreary, sex, few, steven,\n",
      "Nearest to each: other, unexpected, scary, forgotten, still, annoying, simply, horrible,\n",
      "Nearest to to: it, of, its, and, a, this, is, but,\n",
      "Nearest to it: to, but, has, all, is, of, this, and,\n",
      "Nearest to but: it, has, still, to, that, is, right, one,\n",
      "epoch 3, avg loss 3.653209\n",
      "Nearest to completely: forgettable, anywhere, promising, awful, lacking, fight, success, executed,\n",
      "Nearest to of: its, a, the, and, is, sense, out, one,\n",
      "Nearest to aspires: mars, bankrupt, brightly, enemy, unpleasant, ended, choose, unattractive,\n",
      "Nearest to fool: pinocchio, rent, sadly, swear, climax, fill, wooden, puzzling,\n",
      "Nearest to is: a, the, its, and, this, writing, of, joke,\n",
      "Nearest to on: based, goes, rock, going, use, frustrating, empty, for,\n",
      "Nearest to encouraging: bringing, physically, delivering, absurdity, tormented, added, shyamalan, kicking,\n",
      "Nearest to films: actors, this, movie, one, rarely, all, film, memory,\n",
      "Nearest to somehow: managed, lines, god, loud, which, successful, given, attention,\n",
      "Nearest to underrated: jagjit, joyful, anytime, irreverence, pasty, adorns, hushed, haveyourselfahappylittleholocaust,\n",
      "epoch 4, avg loss 3.168016\n",
      "Nearest to entertaining: mildly, thoughtful, certainly, but, substance, more, unpleasant, thoughtprovoking,\n",
      "Nearest to find: yourself, may, viewers, others, themselves, happen, will, youll,\n",
      "Nearest to sublime: distanced, disease, tapestry, crudely, taymor, hinge, polson, puro,\n",
      "Nearest to in: execution, terms, 2002, view, did, fact, wrong, alone,\n",
      "Nearest to produced: written, directed, extreme, disney, searching, low, even, faster,\n",
      "Nearest to an: hour, intriguing, lazy, drugs, excellent, admirable, general, stunt,\n",
      "Nearest to become: presence, happens, spielberg, cult, grows, memory, disappointed, turned,\n",
      "Nearest to end: until, result, fact, feeling, before, the, in, we,\n",
      "Nearest to gravitas: expiry, thousandtimes, dupe, malas, katherine, ditties, mccrudden, melville,\n",
      "Nearest to manner: flat, use, background, seems, relentlessly, pictures, buried, laughable,\n",
      "epoch 5, avg loss 2.960772\n",
      "Nearest to in: exercise, scene, terms, which, trouble, 2002, today, almost,\n",
      "Nearest to despite: flaws, talents, some, virtues, excellent, firstrate, efforts, comedian,\n",
      "Nearest to time: running, waste, long, good, saw, spent, idea, much,\n",
      "Nearest to schiffer: onion, enforcement, veces, camouflaging, happilyever, genialrogue, incoloro, daunting,\n",
      "Nearest to as: serves, thinks, so, begins, a, flat, nor, really,\n",
      "Nearest to feels: like, pilot, hokey, product, involving, incredibly, shelf, looks,\n",
      "Nearest to a: film, flat, single, as, bore, wellmade, deliver, is,\n",
      "Nearest to documentary: endearing, sad, buy, effective, absorbing, nohes, poignant, british,\n",
      "Nearest to screenplay: unfocused, uninspired, direction, obviously, violence, manipulative, trite, bland,\n",
      "Nearest to brain: check, matinee, hook, pleasant, failure, scratching, your, distance,\n",
      "epoch 6, avg loss 2.832360\n",
      "Nearest to seeing: worth, rooting, unless, feel, taking, talking, intelligent, forever,\n",
      "Nearest to summer: diversion, hopefully, entertainment, stuff, plenty, parts, sophisticated, x,\n",
      "Nearest to insanely: newcomer, train, portrays, chief, brisk, status, blackandwhite, chaos,\n",
      "Nearest to is: hoot, about, story, damn, title, its, a, result,\n",
      "Nearest to along: pace, weight, way, long, plods, forgive, drags, silliness,\n",
      "Nearest to in: 2002, scene, right, hour, this, form, wrong, movie,\n",
      "Nearest to convention: loquacious, demise, doses, spacey, stylings, oeuvre, pep, quiz,\n",
      "Nearest to content: provoked, sessions, terms, condescending, disappointingly, 51, mixture, downbeat,\n",
      "Nearest to that: didnt, i, suspect, wish, so, begin, thought, admit,\n",
      "Nearest to still: 1982, decide, later, ask, ripe, groove, telephone, dramas,\n",
      "epoch 7, avg loss 2.745046\n",
      "Nearest to impeccable: timing, strong, respectable, overcome, addresses, flawless, flourishes, comic,\n",
      "Nearest to sympathetic: properly, main, superior, startling, male, defies, damn, thinks,\n",
      "Nearest to to: supposed, attention, tries, bring, listen, willing, try, wants,\n",
      "Nearest to hilarious: sparkling, romantic, comedy, frequently, utterly, followup, jealousy, inviting,\n",
      "Nearest to mail: unidimensional, guided, mccrudden, ismail, eisenbergs, sliding, tactic, allwoman,\n",
      "Nearest to nicholas: nickleby, douglas, dickens, refreshed, mcgraths, cliffsnotes, ralph, version,\n",
      "Nearest to does: job, return, pays, persuasive, convincing, deliver, resembling, rabbitproof,\n",
      "Nearest to plot: holes, twists, hopelessly, moments, hot, bland, track, gaping,\n",
      "Nearest to gifted: marvelous, won, relentless, cat, hightech, alan, lightest, acquainted,\n",
      "Nearest to gravy: fauxcontemporary, hudson, impeccably, wailing, tener, dooby, underarm, precipitously,\n",
      "epoch 8, avg loss 2.665462\n",
      "Nearest to moving: quietly, vibrant, example, genuinely, uplifting, poignant, reveals, portraying,\n",
      "Nearest to queens: nine, men, stealing, harvard, company, cross, hour, smell,\n",
      "Nearest to not: here, necessarily, mention, recommend, maybe, does, laugh, sure,\n",
      "Nearest to you: expect, want, decide, wanting, begin, chances, yourself, unless,\n",
      "Nearest to time: running, long, machine, spent, money, waste, spend, intermittently,\n",
      "Nearest to viewer: sleep, rest, patient, predominantly, drown, leaves, haunted, gimmick,\n",
      "Nearest to of: equivalent, embarrassment, body, pretends, 90, mix, dreary, observation,\n",
      "Nearest to pow: kung, fist, enter, embarrassment, mild, references, impostor, goodfellas,\n",
      "Nearest to in: terms, interested, hands, this, falters, 2002, dogs, idiots,\n",
      "Nearest to hidden: thick, invasion, schemes, klein, coos, comically, keeping, contest,\n",
      "epoch 9, avg loss 2.613762\n",
      "Nearest to of: the, loads, reign, bunch, run, one, sense, type,\n",
      "Nearest to que: culminando, filme, tono, esas, um, lo, ento, hora,\n",
      "Nearest to being: importance, trapped, able, perilously, malkovich, close, ends, necessarily,\n",
      "Nearest to love: triangle, unconditional, punchdrunk, praise, compassion, liza, longing, howard,\n",
      "Nearest to in: stage, terms, schneider, theater, exercise, hands, wake, speaking,\n",
      "Nearest to to: supposed, tries, exploit, willing, decide, pretends, try, decides,\n",
      "Nearest to and: clumsy, obligatory, meandering, paperthin, annoying, goodnatured, hopelessly, clever,\n",
      "Nearest to great: deal, premise, seriously, submarine, respectable, is, execution, bands,\n",
      "Nearest to just: kiss, plain, onto, another, cranky, figure, storyline, irrelevant,\n",
      "Nearest to chops: forum, ended, splendor, freeway, advantage, rodriguez, brains, their,\n",
      "epoch 10, avg loss 2.561526\n",
      "Nearest to a: tearjerker, sketch, hoot, grossout, disappointment, week, trifle, knockoff,\n",
      "Nearest to wayward: smarterthanthou, struggles, rebel, fosters, berlin, oppressive, precocious, messing,\n",
      "Nearest to of: the, loads, worst, reign, bunch, waste, type, vague,\n",
      "Nearest to to: willing, understand, attempt, tried, desperately, brought, exploit, succumb,\n",
      "Nearest to ever: wondered, rolled, brats, seen, whos, ive, fantasized, appreciated,\n",
      "Nearest to you: begin, wanting, swear, that, warning, convinced, wont, unless,\n",
      "Nearest to has: gaghan, scherfig, created, news, share, all, film, delivered,\n",
      "Nearest to the: of, worst, entire, dialogue, is, bottom, barrel, end,\n",
      "Nearest to mr: deeds, juicy, saldanha, bollyholly, conceived, lord, tidy, canvas,\n",
      "Nearest to schneider: rob, carvey, dana, riot, routines, michelle, scripted, catches,\n"
     ]
    }
   ],
   "source": [
    "for i in range(epoch):\n",
    "    total_cost = 0\n",
    "    for k in range(0,(X.shape[0] // batch_size) * batch_size,batch_size):\n",
    "        batch_x = X[k:k+batch_size]\n",
    "        batch_y = Y[k:k+batch_size,np.newaxis]\n",
    "        cost,_ = sess.run([model.cost,model.optimizer],feed_dict={model.X:batch_x,\n",
    "                                                                 model.Y:batch_y})\n",
    "        total_cost += cost\n",
    "    total_cost /= (X.shape[0] // batch_size)\n",
    "    print('epoch %d, avg loss %f'%(i+1,total_cost))\n",
    "    random_valid_size = np.random.choice(indexed, valid_size)\n",
    "    similarity = sess.run(model.similarity,feed_dict={model.valid_dataset:random_valid_size})\n",
    "    for no, i in enumerate(random_valid_size):\n",
    "        valid_word = idx2word[i]\n",
    "        nearest = (-similarity[no, :]).argsort()[1:nearest_neighbors + 1]\n",
    "        log_str = 'Nearest to %s:' % valid_word\n",
    "        for k in range(nearest_neighbors):\n",
    "            close_word = idx2word[nearest[k]]\n",
    "            log_str = '%s %s,' % (log_str, close_word)\n",
    "        print(log_str)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
